#Loading libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(maps)
##
## Attaching package: 'maps'
##
## The following object is masked from 'package:purrr':
##
## map
library(broom)
# Read the 2017 marathon results CSV into a tibble; readr guesses the column types
results_csv <- "/Users/prathyushabhuma/Documents/Florida Polytechnic University/Data Visualization and Reproducible Research/dataviz_final_project/data/marathon_results_2017.csv"
marathon_results <- read_csv(file = results_csv)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 26410 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): Bib, Name, M/F, City, State, Country, 10K, 15K, 20K, Proj Time
## dbl (4): Age, Overall, Gender, Division
## time (8): 5K, Half, 25K, 30K, 35K, 40K, Pace, Official Time
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Dimensions of the dataset (rows, columns)
dim(marathon_results)
## [1] 26410 22
# Preview the first rows of the dataset
head(marathon_results)
# Display the structure of the dataset (column names, types, and sample values)
str(marathon_results)
## spc_tbl_ [26,410 × 22] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Bib : chr [1:26410] "11" "17" "23" "21" ...
## $ Name : chr [1:26410] "Kirui, Geoffrey" "Rupp, Galen" "Osako, Suguru" "Biwott, Shadrack" ...
## $ Age : num [1:26410] 24 30 25 32 31 40 33 28 27 28 ...
## $ M/F : chr [1:26410] "M" "M" "M" "M" ...
## $ City : chr [1:26410] "Keringet" "Portland" "Machida-City" "Mammoth Lakes" ...
## $ State : chr [1:26410] NA "OR" NA "CA" ...
## $ Country : chr [1:26410] "KEN" "USA" "JPN" "USA" ...
## $ 5K : 'hms' num [1:26410] 00:15:25 00:15:24 00:15:25 00:15:25 ...
## ..- attr(*, "units")= chr "secs"
## $ 10K : chr [1:26410] "0:30:28" "0:30:27" "0:30:29" "0:30:29" ...
## $ 15K : chr [1:26410] "0:45:44" "0:45:44" "0:45:44" "0:45:44" ...
## $ 20K : chr [1:26410] "1:01:15" "1:01:15" "1:01:16" "1:01:19" ...
## $ Half : 'hms' num [1:26410] 01:04:35 01:04:35 01:04:36 01:04:45 ...
## ..- attr(*, "units")= chr "secs"
## $ 25K : 'hms' num [1:26410] 01:16:59 01:16:59 01:17:00 01:17:00 ...
## ..- attr(*, "units")= chr "secs"
## $ 30K : 'hms' num [1:26410] 01:33:01 01:33:01 01:33:01 01:33:01 ...
## ..- attr(*, "units")= chr "secs"
## $ 35K : 'hms' num [1:26410] 01:48:19 01:48:19 01:48:31 01:48:58 ...
## ..- attr(*, "units")= chr "secs"
## $ 40K : 'hms' num [1:26410] 02:02:53 02:03:14 02:03:38 02:04:35 ...
## ..- attr(*, "units")= chr "secs"
## $ Pace : 'hms' num [1:26410] 00:04:57 00:04:58 00:04:59 00:05:03 ...
## ..- attr(*, "units")= chr "secs"
## $ Proj Time : chr [1:26410] "-" "-" "-" "-" ...
## $ Official Time: 'hms' num [1:26410] 02:09:37 02:09:58 02:10:28 02:12:08 ...
## ..- attr(*, "units")= chr "secs"
## $ Overall : num [1:26410] 1 2 3 4 5 6 7 8 9 10 ...
## $ Gender : num [1:26410] 1 2 3 4 5 6 7 8 9 10 ...
## $ Division : num [1:26410] 1 2 3 4 5 1 6 7 8 9 ...
## - attr(*, "spec")=
## .. cols(
## .. Bib = col_character(),
## .. Name = col_character(),
## .. Age = col_double(),
## .. `M/F` = col_character(),
## .. City = col_character(),
## .. State = col_character(),
## .. Country = col_character(),
## .. `5K` = col_time(format = ""),
## .. `10K` = col_character(),
## .. `15K` = col_character(),
## .. `20K` = col_character(),
## .. Half = col_time(format = ""),
## .. `25K` = col_time(format = ""),
## .. `30K` = col_time(format = ""),
## .. `35K` = col_time(format = ""),
## .. `40K` = col_time(format = ""),
## .. Pace = col_time(format = ""),
## .. `Proj Time` = col_character(),
## .. `Official Time` = col_time(format = ""),
## .. Overall = col_double(),
## .. Gender = col_double(),
## .. Division = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Display per-column summary statistics of the dataset
summary(marathon_results)
## Bib Name Age M/F
## Length:26410 Length:26410 Min. :18.00 Length:26410
## Class :character Class :character 1st Qu.:34.00 Class :character
## Mode :character Mode :character Median :43.00 Mode :character
## Mean :42.59
## 3rd Qu.:51.00
## Max. :84.00
## City State Country 5K
## Length:26410 Length:26410 Length:26410 Length:26410
## Class :character Class :character Class :character Class1:hms
## Mode :character Mode :character Mode :character Class2:difftime
## Mode :numeric
##
##
## 10K 15K 20K Half
## Length:26410 Length:26410 Length:26410 Length:26410
## Class :character Class :character Class :character Class1:hms
## Mode :character Mode :character Mode :character Class2:difftime
## Mode :numeric
##
##
## 25K 30K 35K 40K
## Length:26410 Length:26410 Length:26410 Length:26410
## Class1:hms Class1:hms Class1:hms Class1:hms
## Class2:difftime Class2:difftime Class2:difftime Class2:difftime
## Mode :numeric Mode :numeric Mode :numeric Mode :numeric
##
##
## Pace Proj Time Official Time Overall
## Length:26410 Length:26410 Length:26410 Min. : 1
## Class1:hms Class :character Class1:hms 1st Qu.: 6604
## Class2:difftime Mode :character Class2:difftime Median :13206
## Mode :numeric Mode :numeric Mean :13206
## 3rd Qu.:19809
## Max. :26411
## Gender Division
## Min. : 1 Min. : 1
## 1st Qu.: 3302 1st Qu.: 502
## Median : 6604 Median :1154
## Mean : 6661 Mean :1589
## 3rd Qu.: 9905 3rd Qu.:2191
## Max. :14438 Max. :5846
# Count the missing values in each column
marathon_results %>%
  is.na() %>%
  colSums()
## Bib Name Age M/F City
## 0 0 0 0 0
## State Country 5K 10K 15K
## 3595 0 25 0 0
## 20K Half 25K 30K 35K
## 0 17 40 25 23
## 40K Pace Proj Time Official Time Overall
## 6 0 0 0 0
## Gender Division
## 0 0
# 1. Interactive Plot
# Histogram of official finish times. The column is stored in seconds,
# so binwidth = 300 gives 5-minute bins.
finish_time_plot <- ggplot(
  data = marathon_results,
  mapping = aes(x = `Official Time`)
) +
  geom_histogram(binwidth = 300, fill = "blue", color = "white") +
  ggtitle("Distribution of Marathon Finish Times") +
  xlab("Finish Time") +
  ylab("Count")
finish_time_plot

# Turn the static ggplot into an interactive plotly widget
interactive_finish_time_plot <- ggplotly(p = finish_time_plot)
interactive_finish_time_plot

# Write the interactive widget out as an HTML file
htmlwidgets::saveWidget(
  widget = interactive_finish_time_plot,
  file = "interactive_finish_time_plot.html"
)
# 2. Map of participant origins
# Load world shapefile from Natural Earth
# https://www.naturalearthdata.com/downloads/110m-cultural-vectors/
world_shapes <- read_sf("/Users/prathyushabhuma/Documents/Florida Polytechnic University/Data Visualization and Reproducible Research/MiniProject2/Data/ne_110m_admin_0_countries")
head(world_shapes)

# Count the participants from each country code
country_count <- marathon_results %>%
  count(Country, name = "users")

# Build the country-code join key on the shapefile side
world_shapes <- world_shapes %>%
  mutate(join_key = if_else(ISO_A3_EH == "NOR", SU_A3, ISO_A3_EH))

# Expose the same key name on the counts side so both tables join on `join_key`
country_counts <- country_count %>%
  mutate(join_key = Country)

# Join on the constructed key. Previously the counts were matched against SU_A3,
# which left `join_key` unused.
# NOTE(review): SU_A3 appears to be a map-subunit code that can differ from the
# ISO code for some countries (e.g. France) -- confirm against the Natural Earth
# attribute table.
# NOTE(review): the race's country codes may not all be ISO alpha-3; unmatched
# countries get NA and are drawn in light gray -- verify with an anti_join().
map_data <- left_join(world_shapes, country_counts, by = "join_key")

# Plot the choropleth map with ggplot2 / geom_sf
ggplot(map_data) +
  geom_sf(aes(fill = users)) +
  scale_fill_gradient(
    low = "lightgreen", high = "orange",
    na.value = "lightgray", name = "Participants"
  ) +
  ggtitle("Marathon Participants by Country") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 18, face = "bold"),
    legend.title = element_text(size = 12, face = "bold"),
    legend.text = element_text(size = 10),
    # Height and width are set explicitly, so a separate legend.key.size
    # would be overridden and is omitted
    legend.key.height = unit(1, "cm"),
    legend.key.width = unit(1, "cm"),
    legend.position = "bottom",
    panel.grid = element_blank(),
    axis.text = element_blank(),
    axis.title = element_blank()
  )
## Linear model predicting finish time based on age and gender
# Derive the finish time in minutes (the hms value is in seconds) and
# encode `M/F` as a factor, both in a single mutate() call
marathon_results <- mutate(
  marathon_results,
  Official_Time_Minutes = as.numeric(hms::as_hms(`Official Time`)) / 60,
  `M/F` = factor(`M/F`)
)
# Regress finish time (minutes) on age and gender
lm_model <- lm(
  formula = Official_Time_Minutes ~ Age + `M/F`,
  data = marathon_results
)
# Print the coefficient estimates and fit statistics
summary(lm_model)
##
## Call:
## lm(formula = Official_Time_Minutes ~ Age + `M/F`, data = marathon_results)
##
## Residuals:
## Min 1Q Median 3Q Max
## -104.085 -27.548 -9.507 18.620 245.891
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 206.61783 0.93404 221.21 <2e-16 ***
## Age 1.06309 0.02159 49.23 <2e-16 ***
## `M/F`M -25.30338 0.49531 -51.09 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 39.18 on 26407 degrees of freedom
## Multiple R-squared: 0.1361, Adjusted R-squared: 0.136
## F-statistic: 2080 on 2 and 26407 DF, p-value: < 2.2e-16
# Bar chart of the model coefficients with +/- one standard error bars
coef_plot <- lm_model %>%
  tidy() %>%
  mutate(
    lower = estimate - std.error,
    upper = estimate + std.error
  ) %>%
  ggplot(aes(x = term, y = estimate, ymin = lower, ymax = upper)) +
  geom_col(fill = "skyblue", width = 0.5) +
  geom_errorbar(width = 0.2, color = "black") +
  labs(title = "Linear Model Coefficients", x = "Term", y = "Estimate") +
  theme_minimal()
# Render the coefficient plot
print(coef_plot)
# Number of runners in each `M/F` category
table(marathon_results[["M/F"]])
##
## F M
## 11972 14438
# Extension to the original mini project that was submitted earlier, based on the feedback provided
# Displaying the summary of the linear model again (same `lm_model` as fitted above)
summary(lm_model)
##
## Call:
## lm(formula = Official_Time_Minutes ~ Age + `M/F`, data = marathon_results)
##
## Residuals:
## Min 1Q Median 3Q Max
## -104.085 -27.548 -9.507 18.620 245.891
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 206.61783 0.93404 221.21 <2e-16 ***
## Age 1.06309 0.02159 49.23 <2e-16 ***
## `M/F`M -25.30338 0.49531 -51.09 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 39.18 on 26407 degrees of freedom
## Multiple R-squared: 0.1361, Adjusted R-squared: 0.136
## F-statistic: 2080 on 2 and 26407 DF, p-value: < 2.2e-16
# Plotting diagnostic plots to check the assumptions of the linear model
# par() returns the previous settings, so keep them and restore them afterwards
# instead of assuming the layout was 1x1
old_par <- par(mfrow = c(2, 2)) # Arrange plots in a 2x2 grid
plot(lm_model)
par(old_par) # Restore the previous plot layout
# Visualizing actual vs. predicted values
# Store the fitted finishing time (minutes) for every runner
marathon_results$predicted_time <- predict(lm_model, newdata = marathon_results)
ggplot(marathon_results, aes(x = Age, y = Official_Time_Minutes, color = `M/F`)) +
  geom_point(alpha = 0.5) +
  # The constant color = "black" replaces the color mapping in this layer, and
  # with it the implicit per-gender grouping. Without an explicit group, the
  # male and female predictions are joined into one zig-zag path.
  geom_line(
    aes(y = predicted_time, group = `M/F`),
    color = "black", linetype = "dashed"
  ) +
  labs(
    title = "Actual vs. Predicted Finishing Time by Age and Gender",
    x = "Age",
    y = "Finishing Time (Minutes)"
  ) +
  scale_color_manual(values = c("M" = "blue", "F" = "pink")) +
  theme_minimal()
# Comment: The scatter plot shows the relationship between official finishing time (in minutes) and age for marathon participants, with male and female runners distinguished by color. Blue dots represent male finishers, while pink dots represent female finishers. The black dashed line illustrates the linear regression model's predicted finishing times based on age and gender. The plot reveals that finishing times tend to increase with age and that, on average, males have faster finishing times than females. The spread of the data indicates considerable variability in finishing times across ages and genders, highlighting the complex factors that influence marathon performance.